Suffix Automaton Algorithm

The Suffix Automaton Algorithm is a highly efficient data structure used for processing and analyzing strings. It is a deterministic finite automaton (DFA) that represents all the suffixes of a given input string in a compact and compressed manner. The algorithm is capable of solving a wide range of string-related problems, such as pattern matching, substring occurrences, and longest common substrings, in linear time complexity. The key idea behind the algorithm is to build a directed acyclic graph (DAG) with nodes representing states and edges representing transitions between states based on characters in the input string. Each state in the automaton corresponds to a unique set of suffixes of the input string, and the automaton as a whole represents all possible suffixes. Constructing the suffix automaton involves adding characters one by one from the input string and updating the states and transitions accordingly. The algorithm leverages the concept of suffix links, which are pointers that link states representing similar suffixes, to optimize the construction process and ensure linear time complexity. Once the automaton is built, it can be used for various string processing tasks by traversing the graph structure and following the appropriate transitions. For instance, to find the longest common substring between two strings, one can traverse the automaton with the second string and maintain a count of the longest path followed. The suffix automaton algorithm offers significant advantages over other string processing techniques, such as suffix trees, in terms of space complexity and ease of implementation, making it a powerful tool for tackling complex string-related problems.
/*
 Petar 'PetarV' Velickovic
 Data Structure: Suffix Automaton
*/

#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <string.h>
#include <time.h>
#include <iostream>
#include <vector>
#include <list>
#include <string>
#include <algorithm>
#include <queue>
#include <stack>
#include <set>
#include <map>
#include <complex>
#include <chrono>
#include <random>
#include <unordered_map>

#define MAX_N 250001
#define MAX_S 1000010
#define MAX_K 26

typedef long long lld;
typedef unsigned long long llu;
using namespace std;

/*
 The suffix automaton is a minimal deterministic finite automaton (DFA) accepting all
 suffices of a given string. It is useful for a variety of actions, one of which is efficiently
 computing the longest common substring (LCS) between two strings.
 Complexity:    O(n) for constructing the automaton (character by character)
                O(m) for computing the LCS
*/

char inp[MAX_N];

struct Node
{
    int len, prev;
    int adj[MAX_K];
};

Node DFA[MAX_S];
int tot = 0, sink;

inline void init_automaton()
{
    DFA[0].len = 0;
    DFA[0].prev = -1;
    for (int i=0;i<26;i++) DFA[0].adj[i] = -1;
    sink = 0;
    tot = 1;
}

inline void extend_automaton(char ch)
{
    int cc = ch - 'a';
    
    int tail = tot++;
    DFA[tail].len = DFA[sink].len + 1;
    for (int i=0;i<26;i++) DFA[tail].adj[i] = -1;
    
    int curr = sink;
    while (curr != -1 && DFA[curr].adj[cc] == -1)
    {
        DFA[curr].adj[cc] = tail;
        curr = DFA[curr].prev;
    }
    
    if (curr == -1)
    {
        DFA[tail].prev = 0;
    }
    else
    {
        int nxt = DFA[curr].adj[cc];
        if (DFA[nxt].len == DFA[curr].len + 1)
        {
            DFA[tail].prev = nxt;
        }
        else
        {
            int cl = tot++;
            DFA[cl].len = DFA[curr].len + 1;
            DFA[cl].prev = DFA[nxt].prev;
            for (int i=0;i<26;i++) DFA[cl].adj[i] = DFA[nxt].adj[i];
            
            while (curr != -1 && DFA[curr].adj[cc] == nxt)
            {
                DFA[curr].adj[cc] = cl;
                curr = DFA[curr].prev;
            }
            
            DFA[tail].prev = DFA[nxt].prev = cl;
        }
    }
    sink = tail;
}

inline string lcs(string needle)
{
    int curr_len = 0;
    int best_len = 0, best_pos = -1;
    int st = 0;
    
    for (int i=0;i<needle.length();i++)
    {
        char cc = needle[i] - 'a';
        if (DFA[st].adj[cc] == -1)
        {
            while (st != -1 && DFA[st].adj[cc] == -1)
            {
                st = DFA[st].prev;
            }
            if (st == -1)
            {
                st = 0, curr_len = 0;
                continue;
            }
            curr_len = DFA[st].len;
        }
        curr_len++;
        if (best_len < curr_len)
        {
            best_len = curr_len;
            best_pos = i;
        }
        st = DFA[st].adj[cc];
    }
    return needle.substr(best_pos - best_len + 1, best_len);
}

int main()
{
    string s1 = "alsdfkjfjkdsal";
    string s2 = "fdjskalajfkdsla";
    
    init_automaton();
    for (int i=0;i<s1.length();i++) extend_automaton(s1[i]);
    
    cout << lcs(s2) << endl;
    
    return 0;
}

LANGUAGE:

DARK MODE: